https://kforthman.shinyapps.io/500citiescounties/
#
#remove scientific notation
options(scipen=999)
library(stringr)
library(corrplot)
## corrplot 0.84 loaded
library(shiny)
library(lme4)
## Loading required package: Matrix
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"
# Read in the data
US.deaths <- read.csv(
paste0(data.path, "time_series_covid19_deaths_US.csv"),
header = T, stringsAsFactors = F)
US.cases <- read.csv(
paste0(data.path, "time_series_covid19_confirmed_US.csv"),
header = T, stringsAsFactors = F)
# Read in the header seprately.
US.cases.head <- read.csv(
paste0(data.path, "time_series_covid19_confirmed_US.csv"),
header = F, stringsAsFactors = F)[1,]
US.deaths.head <- read.csv(
paste0(data.path, "time_series_covid19_deaths_US.csv"),
header = F, stringsAsFactors = F)[1,]
# Correct the dates in the header to be more useable as
# column names.
proper_date <- function(dates){
dates <- sapply(dates, strsplit, split = "/")
dates <- lapply(dates, str_pad, width = 2, side = "left", pad = "0")
dates <- lapply(dates, paste, collapse = "_")
dates <- unlist(dates)
return(dates)
}
dates.cases <- proper_date(US.cases.head[-c(1:11)])
dates.deaths <- proper_date(US.deaths.head[-c(1:12)])
names(US.cases) <- c(US.cases.head[1,1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1,1:12], dates.deaths)
if(sum(US.cases$UID != US.deaths$UID, na.rm = T) > 0){warning("COVID data rows do not match!")}
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[,c(1:11, ncol(US.cases), 12:(ncol(US.cases)-1))]
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/"
daily_filenames <- list.files(data.path)
daily_filenames <- daily_filenames[daily_filenames != "README.md"]
todays_report_filename <- daily_filenames[length(daily_filenames)]
US.todaysReport <- read.csv(
paste0(data.path, todays_report_filename),
header = T, stringsAsFactors = F)
all.states <- c('Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Diamond Princess', 'District of Columbia', 'Florida', 'Georgia', 'Grand Princess', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming')
all.states.df <- data.frame(Province_State = all.states)
all.stats <- c("Confirmed", "Deaths", "Recovered", "Active", "Incident_Rate", "People_Tested", "People_Hospitalized", "Mortality_Rate", "Testing_Rate", "Hospitalization_Rate")
compiled.stats <- list()
for(i in 1:length(daily_filenames)){
day <- substring(daily_filenames[i],1,10)
data <- read.csv(
paste0(data.path, daily_filenames[i]),
header = T, stringsAsFactors = F)
compiled.stats[[i]] <- merge(all.states.df, data, all.y = F)
names(compiled.stats)[i] <- day
}
plot.dailyStat <- function(state, stat){
data <- sapply(1:length(daily_filenames), function(x){compiled.stats[[x]][compiled.stats[[x]]$Province_State == state, stat]})
names(data) <- daily_filenames
barplot(data, main = paste0(state, " ", stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
plot.dailyStatRise <- function(state, stat){
data <- sapply(1:length(daily_filenames), function(x){compiled.stats[[x]][compiled.stats[[x]]$Province_State == state, stat]})
names(data) <- daily_filenames
rise.stat <- matrix(ncol = length(data) - 1, nrow = 1)
colnames(rise.stat) <- names(data)[-1]
for(i in 1:ncol(rise.stat) + 1){
rise <- data[i] - data[i-1]
rise.stat[i-1] <- rise
}
barplot(rise.stat, main = paste0(state, " rise in ",stat), las = 2, cex.axis = 1, cex.names = 0.5)
}
testing.data.state <- compiled.stats[[length(daily_filenames)]][, c("Province_State", "Testing_Rate")]
testing.data.state <- testing.data.state[!is.na(testing.data.state$Testing_Rate),]
testing.data.state <- testing.data.state[order(testing.data.state$Testing_Rate),]
col.state <- rep("pink", nrow(testing.data.state))
avg.test.rate <- mean(testing.data.state$Testing_Rate, na.rm = T)
col.state[testing.data.state$Testing_Rate < avg.test.rate] <- "grey"
col.state[testing.data.state$Province_State == "Oklahoma"] <- "lightblue"
par(mar = c(5,6,4,2))
barplot(testing.data.state$Testing_Rate, names.arg = testing.data.state$Province_State, horiz = T, main = "Testing Rate by State", las = 2, cex.axis = 1, cex.names = 0.5, col = col.state, border = F, xlab = "Total number of people tested per 100,000 persons.")
abline(v = avg.test.rate, col = "red")
text(x = avg.test.rate + 10, y = 1, labels = "Average Testing Rate", adj = c(0, 0.5), col = "red")
Province_State - The name of the State within the USA. Country_Region - The name of the Country (US). Last_Update - The most recent date the file was pushed. Lat - Latitude. Long_ - Longitude. Confirmed - Aggregated confirmed case count for the state. Deaths - Aggregated Death case count for the state. Recovered - Aggregated Recovered case count for the state. Active - Aggregated confirmed cases that have not been resolved (Active = Confirmed - Recovered - Deaths). FIPS - Federal Information Processing Standards code that uniquely identifies counties within the USA. Incident_Rate - confirmed cases per 100,000 persons. People_Tested - Total number of people who have been tested. People_Hospitalized - Total number of people hospitalized. Mortality_Rate - Number recorded deaths * 100/ Number confirmed cases. UID - Unique Identifier for each row entry. ISO3 - Officialy assigned country code identifiers. Testing_Rate - Total number of people tested per 100,000 persons. Hospitalization_Rate - Total number of people hospitalized * 100/ Number of confirmed cases.
US.cases.info <- as.matrix(US.cases[,1:12])
US.cases.data <- as.matrix(US.cases[,-c(2:12)])
US.deaths.info <- as.matrix(US.deaths[,1:12])
US.deaths.data <- as.matrix(US.deaths[,-c(2:12)])
rownames(US.cases.info) <- US.cases.info[,1]
US.cases.info <- US.cases.info[,-1]
rownames(US.cases.data) <- US.cases.data[,1]
US.cases.data <- US.cases.data[,-1]
rownames(US.deaths.info) <- US.deaths.info[,1]
US.deaths.info <- US.deaths.info[,-1]
rownames(US.deaths.data) <- US.deaths.data[,1]
US.deaths.data <- US.deaths.data[,-1]
ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)
nobs <- nrow(US.cases.data)
state.curve <- function(state, stat = c("cases", "deaths"), logScale = T){
if(stat == "cases"){
data <- US.cases.data[which(US.cases$Province_State == state),]
}else if(stat == "deaths"){
data <- US.deaths.data[which(US.deaths$Province_State == state),]
}
data.sum <- colSums(data)
day.first.case <- min(which(data.sum > 0))
n.days <- length(data.sum)
if(logScale == T){
barplot(data.sum[day.first.case:n.days],
main = paste0("Total COVID-19 ", stat," by date in ", state, ", log scale"),
log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}else{
barplot(data.sum[day.first.case:n.days],
main = paste0("Total COVID-19 ", stat," by date in ", state),
las = 2, cex.axis = 1, cex.names = 0.5)
}
}
state.rise <- function(state, stat = c("cases", "deaths")){
if(stat == "cases"){
data.thisState <- US.cases.data[which(US.cases$Province_State == state),]
}else if(stat == "deaths"){
data.thisState <- US.deaths.data[which(US.deaths$Province_State == state),]
}
data.sum <- colSums(data.thisState)
n.days <- ncol(data.thisState)
rise.cases <- matrix(ncol = n.days - 1, nrow = 1)
colnames(rise.cases) <- colnames(data.thisState)[-1]
for(i in 1:ncol(rise.cases) + 1){
rise <- data.sum[i] - data.sum[i-1]
rise.cases[i-1] <- rise
}
day.first.case <- min(which(rise.cases > 0))
n.days <- length(rise.cases)
barplot(rise.cases[,day.first.case:n.days], main = paste0("Rise in COVID-19 ", stat, " by Date in ", state), las = 2, cex.axis = 1, cex.names = 0.5)
}
county.curve <- function(county, stat = c("cases", "deaths")){
if(stat == "cases"){
data <- US.cases.data[which(US.cases$Admin2 == county),]
}else if(stat == "deaths"){
data <- US.deaths.data[which(US.deaths$Admin2 == county),]
}
day.first.case <- min(which(data > 0))
n.days <- length(data)
barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}
county.curve("Tulsa", "cases")
county.curve("Tulsa", "deaths")
US.stats <- data.frame(UID = US.cases$UID)
cases.total <- colSums(US.cases.data)
day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)
par(mar = c(5,5,4,2))
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
deaths.total <- colSums(US.deaths.data)
day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)
barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
avg.rise.cases
rise.cases <- matrix(ncol = ndays.cases - 1, nrow = nobs)
colnames(rise.cases) <- colnames(US.cases.data)[-1]
for(i in 1:ncol(rise.cases) + 1){
rise <- US.cases.data[,i] - US.cases.data[,i-1]
rise.cases[,i-1] <- rise
}
US.stats$avg.rise.cases <- apply(rise.cases, 1, mean)
rise.cases.total <- colSums(rise.cases)
day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)
barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
avg.rise.deaths
rise.deaths <- matrix(ncol = ndays.deaths - 1, nrow = nobs)
colnames(rise.deaths) <- colnames(US.deaths.data)[-1]
for(i in 1:ncol(rise.deaths) + 1){
rise <- US.deaths.data[,i] - US.deaths.data[,i-1]
rise.deaths[,i-1] <- rise
}
US.stats$avg.rise.deaths <- apply(rise.deaths, 1, mean)
rise.deaths.total <- colSums(rise.deaths)
day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)
barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
total.cases
US.stats$total.cases <- US.cases.data[,ndays.cases]
US.stats$total.cases.percap <- US.stats$total.cases / US.cases$Population
US.stats$total.cases.percap[US.cases$Population == 0] <- NA
hist(US.stats$total.cases.percap)
total.deaths
US.stats$total.deaths <- US.deaths.data[,ndays.deaths]
total.deaths.percap
US.stats$total.deaths.percap <- US.stats$total.deaths / US.deaths$Population
US.stats$total.deaths.percap[US.deaths$Population == 0] <- NA
max(US.stats$total.deaths.percap,na.rm = T)
## [1] 0.003018157
total.deaths.percase Error in Johns Hopkins data has rows with total.deaths > total.cases.
# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0
US.stats$total.deaths.percase <- US.stats$total.deaths / US.stats$total.cases
US.stats$total.deaths.percase[US.stats$total.cases == 0] <- NA
US.stats[which(US.stats$total.deaths > US.stats$total.cases),]
## UID avg.rise.cases avg.rise.deaths total.cases
## 3155 84080008 0.0000000 0.02083333 0
## 3203 84090002 0.0000000 0.04166667 0
## 3206 84090006 0.0000000 0.02083333 0
## 3222 84090024 0.0000000 0.90625000 0
## 3230 84090032 0.0000000 0.01041667 0
## 3231 84090033 0.1145833 0.59375000 11
## 3252 84090056 0.0000000 0.06250000 0
## total.cases.percap total.deaths total.deaths.percap
## 3155 NA 2 NA
## 3203 NA 4 NA
## 3206 NA 2 NA
## 3222 NA 87 NA
## 3230 NA 1 NA
## 3231 NA 57 NA
## 3252 NA 6 NA
## total.deaths.percase
## 3155 NA
## 3203 NA
## 3206 NA
## 3222 NA
## 3230 NA
## 3231 5.181818
## 3252 NA
US.stats$ID <- str_pad(US.stats$UID, 8, "left", pad = "0")
US.stats$ID <- substr(US.stats$ID, 4, 8)
data.merge <- merge(US.stats, county_factors, by = "ID")
data.cor <- cor(data.merge[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = F)
data.cor2 <- cor(data.merge2[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor2, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
corrplot.mixed(data.cor2[1:7,8:42], upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
US.todaysReport.states <- US.todaysReport[!is.na(US.todaysReport$FIPS) & nchar(US.todaysReport$FIPS)<=2,]
US.todaysReport.states$FIPS <- str_pad(US.todaysReport.states$FIPS, 2, "left", pad = "0")
data.merge2$stateID <- substr(data.merge2$ID,1,2)
data.merge3 <- merge(data.merge2, US.todaysReport.states, by.x = "stateID", by.y = "FIPS")
this.lme <- lmer("total.cases.percap ~ Affluence + Singletons.in.Tract + Seniors.in.Tract + African.Americans.in.Tract + Noncitizens.in.Tract + High.BP + Binge.Drinking + Cancer + Asthma + Heart.Disease + COPD + Smoking + Diabetes + No.Physical.Activity + Obesity + Poor.Sleeping.Habits + Poor.Mental.Health + Testing_Rate + Hospitalization_Rate + (1 | stateID)", data = data.merge3)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning: Some predictor variables are on very different scales: consider
## rescaling
print(summary(this.lme), correlation=TRUE)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## "total.cases.percap ~ Affluence + Singletons.in.Tract + Seniors.in.Tract + African.Americans.in.Tract + Noncitizens.in.Tract + High.BP + Binge.Drinking + Cancer + Asthma + Heart.Disease + COPD + Smoking + Diabetes + No.Physical.Activity + Obesity + Poor.Sleeping.Habits + Poor.Mental.Health + Testing_Rate + Hospitalization_Rate + (1 | stateID)"
## Data: data.merge3
##
## REML criterion at convergence: -2542.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3750 -0.3734 -0.0561 0.2348 7.5446
##
## Random effects:
## Groups Name Variance Std.Dev.
## stateID (Intercept) 0.000003224 0.001796
## Residual 0.000007298 0.002702
## Number of obs: 322, groups: stateID, 49
##
## Fixed effects:
## Estimate Std. Error df
## (Intercept) -0.0092729370 0.0059555856 168.6645984151
## Affluence 0.0020381653 0.0005475543 287.9930549318
## Singletons.in.Tract 0.0011147099 0.0005113381 300.4519750093
## Seniors.in.Tract 0.0005746364 0.0006468216 300.6483458645
## African.Americans.in.Tract 0.0011286671 0.0006218167 301.9945354073
## Noncitizens.in.Tract 0.0008019220 0.0004964915 246.3760681646
## High.BP -0.0000350011 0.0001118935 277.0605960621
## Binge.Drinking 0.0002269260 0.0001147227 130.1982352872
## Cancer -0.0005074758 0.0006506449 239.3122854064
## Asthma 0.0000118199 0.0003954767 145.1888987285
## Heart.Disease 0.0020196390 0.0008408226 179.2663648112
## COPD -0.0005245409 0.0006555860 188.3933609761
## Smoking -0.0002375821 0.0001455541 210.1095405826
## Diabetes -0.0005367782 0.0003202410 253.4553392772
## No.Physical.Activity 0.0000794660 0.0001257329 211.7685883509
## Obesity 0.0001439090 0.0001043171 300.4259061436
## Poor.Sleeping.Habits 0.0002298601 0.0000997605 284.8393154611
## Poor.Mental.Health -0.0000584112 0.0003201366 87.1119382760
## Testing_Rate 0.0000006171 0.0000003860 48.5978168936
## Hospitalization_Rate -0.0000911451 0.0000529007 36.1705213472
## t value Pr(>|t|)
## (Intercept) -1.557 0.121341
## Affluence 3.722 0.000237 ***
## Singletons.in.Tract 2.180 0.030034 *
## Seniors.in.Tract 0.888 0.375036
## African.Americans.in.Tract 1.815 0.070499 .
## Noncitizens.in.Tract 1.615 0.107552
## High.BP -0.313 0.754662
## Binge.Drinking 1.978 0.050036 .
## Cancer -0.780 0.436186
## Asthma 0.030 0.976198
## Heart.Disease 2.402 0.017328 *
## COPD -0.800 0.424655
## Smoking -1.632 0.104122
## Diabetes -1.676 0.094938 .
## No.Physical.Activity 0.632 0.528054
## Obesity 1.380 0.168756
## Poor.Sleeping.Habits 2.304 0.021936 *
## Poor.Mental.Health -0.182 0.855648
## Testing_Rate 1.599 0.116407
## Hospitalization_Rate -1.723 0.093439 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of fixed effects could have been required in summary()
##
## Correlation of Fixed Effects:
## (Intr) Afflnc Sng..T Snr..T A.A..T Nnc..T Hgh.BP Bng.Dr Cancer
## Affluence -0.059
## Sngltns.n.T -0.056 0.042
## Snrs.n.Trct 0.407 0.291 0.073
## Afrcn.Am..T 0.243 0.074 -0.401 0.202
## Nnctzns.n.T -0.073 0.153 0.126 0.057 -0.182
## High.BP -0.113 0.162 0.093 0.010 -0.247 0.343
## Bing.Drnkng -0.443 -0.071 -0.213 -0.088 0.033 -0.070 0.156
## Cancer -0.492 -0.088 0.234 -0.179 -0.059 -0.079 -0.323 -0.047
## Asthma -0.277 -0.068 -0.256 -0.093 0.003 0.206 0.092 0.004 -0.147
## Heart.Dises -0.092 0.092 -0.296 -0.125 0.201 -0.037 0.001 0.038 -0.578
## COPD 0.506 -0.024 0.120 0.159 0.000 0.144 0.043 0.078 -0.225
## Smoking -0.058 0.101 -0.118 -0.138 -0.108 0.148 -0.086 -0.322 0.164
## Diabetes 0.079 -0.318 -0.077 -0.140 -0.222 -0.275 -0.438 0.081 0.330
## N.Physcl.Ac -0.139 0.049 0.093 0.076 0.062 -0.266 0.009 0.102 0.351
## Obesity -0.067 0.385 0.400 0.205 0.143 0.186 -0.113 -0.180 0.136
## Pr.Slpng.Hb -0.385 -0.345 0.164 -0.325 -0.326 -0.022 -0.147 0.093 0.031
## Pr.Mntl.Hlt -0.372 0.201 -0.008 0.013 0.045 -0.166 0.028 0.113 0.419
## Testing_Rat 0.115 -0.101 0.024 -0.031 0.007 -0.078 -0.072 -0.040 -0.010
## Hsptlztn_Rt -0.136 0.066 -0.028 -0.065 -0.039 -0.011 0.007 -0.023 -0.022
## Asthma Hrt.Ds COPD Smokng Diabts N.Ph.A Obesty Pr.S.H Pr.M.H
## Affluence
## Sngltns.n.T
## Snrs.n.Trct
## Afrcn.Am..T
## Nnctzns.n.T
## High.BP
## Bing.Drnkng
## Cancer
## Asthma
## Heart.Dises 0.373
## COPD -0.382 -0.505
## Smoking 0.115 0.064 -0.445
## Diabetes -0.158 -0.440 0.041 0.287
## N.Physcl.Ac 0.004 -0.326 0.032 -0.269 -0.195
## Obesity -0.145 -0.030 0.080 -0.210 -0.370 -0.040
## Pr.Slpng.Hb 0.037 0.254 -0.111 -0.163 -0.076 -0.145 -0.124
## Pr.Mntl.Hlt -0.371 -0.048 -0.402 -0.024 0.043 -0.047 0.040 -0.092
## Testing_Rat -0.313 -0.177 0.221 0.077 0.180 -0.097 0.062 -0.147 -0.061
## Hsptlztn_Rt -0.081 0.074 -0.064 0.060 -0.065 0.054 0.039 -0.048 0.050
## Tstn_R
## Affluence
## Sngltns.n.T
## Snrs.n.Trct
## Afrcn.Am..T
## Nnctzns.n.T
## High.BP
## Bing.Drnkng
## Cancer
## Asthma
## Heart.Dises
## COPD
## Smoking
## Diabetes
## N.Physcl.Ac
## Obesity
## Pr.Slpng.Hb
## Pr.Mntl.Hlt
## Testing_Rat
## Hsptlztn_Rt 0.237
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
this.lme.sum <- summary(this.lme)